/*          
    Purpose: This file merges the output of 
             2a and 2b, tags discrepancies in father
             occupation recall between fathers and 
             adult children, creates logged measures of 
             income for fathers and adult children, and 
             creates ranked measures in both generations.

    Creates: PSID_retrospective_analysis.dta 
*/
* Session setup: empty memory, no output paging, and work relative to the
* PSID data-source folder under the global $Mydirectory1.
clear
set more off

cd "$Mydirectory1/1_DataSources/PSID"

*-------------------------------------------------------------------*
*-------------------------------------------------------------------*

**************************************
*** MERGE ADULT CHILDREN AND FATHERS
**************************************

	* Master: adult-child retrospective file (many rows per father_id).
	use ./output/PSID_sons_retrospective.dta, clear

	* Attach the father-level file by father_id. keep(match using) retains
	* matched rows and using-only rows, i.e. it discards master-only rows
	* (adult-child records with no father record in the using file).
	merge m:1 father_id using ./output/Fathers_modaloccs.dta, keep(match using) nogenerate

	sort son_id year
	
************************************************
/* MISTAKES IN RECALL OF RETROSPECTIVE FATHER 
   OCCUPATION AMONG ADULT CHILDREN   */
************************************************

	* Everything between preserve/restore works on a temporary copy of the
	* data; the only thing that survives is the tempfile `mistakes'
	* (son_id, father_id, occ_match).
	preserve

	* Restrict to the rows flagged firstobs_son==1.
	keep if firstobs_son==1

	keep occ_* father_occ_* number_* mode_occ_* mode_occ_son byr divorced race_son father_id son_id
	gen black = (race_son==2)

	* Dummy: yes = 1 when any of occ_1..occ_5 equals any of
	* father_occ_1..father_occ_10 (adult child's answer matches Dad's answer),
	* with both sides required to be different from system missing.
	gen yes = .
	forvalues s = 1/5 {
		forvalues f = 1/10 {
			replace yes = 1 if occ_`s'==father_occ_`f' & occ_`s'!=. & father_occ_`f'!=.
		}
	}

	* How often do adult children get it right? Tabulate before and after
	* the still-unset rows are coded as non-matches.
	tab yes, m
	replace yes = 0 if yes==.
	tab yes, m

	* Match rate against divorce and race, absorbing byr, with standard
	* errors clustered on father_id.
	foreach rhs in divorced black {
		areg yes `rhs', cluster(father_id) abs(byr)
	}

	* What are common mistakes? Number every combination of the two modal
	* occupation variables and list the most frequent combinations among
	* non-matches.
	sort son_id father_id
	egen groups = group(mode_occ_son_max mode_occ_30to50_max)
	tab groups if yes==0, sort

	* NOTE(review): these group ids are hard-coded and depend on the numbering
	* egen group() produces for the current data -- confirm after any data change.
	foreach g in 148 131 135 154 143 125 {
		tab mode_occ_son_max mode_occ_30to50_max if groups==`g'
	}

	* Keep only the identifiers and the match flag for the later merge.
	keep son_id father_id yes
	rename yes occ_match

	tempfile mistakes
	save `mistakes'

	restore
	
**-----------------------------------------------------------------------------**
**-----------------------------------------------------------------------------**

************************************************
/* BRING IN RETROSPECTIVE FATHER 
   INCOME SCORES (AS PROVIDED BY 
   ADULT CHILDREN)  */
************************************************
	
* Regular retrospective (non-modal)
	* The income-score file is keyed on (fatheroccej, race, south_merge).
	* Race and region keys are renamed once; the occupation key is swapped in
	* and out for each of the three retrospective occupation measures below.
	rename south_retrospective south_merge
	rename race_son race

	* Parallel lists, one entry per pass:
	*   occ_in  - occupation variable used as the merge key
	*   occ_out - name that variable carries after the pass
	*   tags    - suffix for the HHincome_40/70/80 score variables
	*     pass 1: regular retrospective (non-modal)
	*     pass 2: modal retrospective occupation
	*     pass 3: modal retrospective occupation when adult child
	*             between ages 30 and 50
	local occ_in  "father_occ_retrospective mode_occ_son mode_occ_son30to50_max"
	local occ_out "father_occ_retrospective father_mode_retro_occ mode_occ_son30to50"
	local tags    "retro mode_retro mode_retro_30to50"

	forvalues k = 1/3 {
		local src : word `k' of `occ_in'
		local dst : word `k' of `occ_out'
		local tag : word `k' of `tags'

		rename `src' fatheroccej

		merge m:1 fatheroccej race south_merge using ../CensusData/output/IncomeScores_Coarsened_byrace_bysouth.dta
		* Master rows may go unmatched only when one of the keys is missing.
		assert fatheroccej==. | race==. | south_merge==. if _merge==1
		drop if _merge==2
		drop _merge

		* Discard the score variables that are not used here, so the next
		* pass can bring them in again without a name clash.
		drop  avg_incwage* avgincwage* avg_inctot*

		rename avg_HHinc_byr_bys_CWfix HHincome_40_`tag'
		rename avg_HHinc_1970_byocc_byr_bys HHincome_70_`tag'
		rename avg_HHinc_1980_byocc_byr_bys HHincome_80_`tag'

		rename fatheroccej `dst'
	}

	rename south_merge south_mode_son2

**-----------------------------------------------------------------------------**
**-----------------------------------------------------------------------------**

**********************
* LIGHT CLEANING
**********************

	label var year "Year of interview"
	label var father_id "Father 1968 ID"

	* Squared age is built from age_s before that variable is renamed to age.
	gen agesq = age_s*age_s
	rename age_s age
	label var agesq "Age squared"

	//Five year age bins (for adult children)
	// bin 1     : under 20
	// bins 2-10 : five-year bands from [20,25) up to [60,65)
	// bin 11    : 65 and older
	// Rows with missing age keep bin missing (age<. excludes them).
	gen bin=1 if age<20
	replace bin=2 if age>=20 & age<25
	replace bin=3 if age>=25 & age<30
	replace bin=4 if age>=30 & age<35
	replace bin=5 if age>=35 & age<40
	replace bin=6 if age>=40 & age<45
	replace bin=7 if age>=45 & age<50
	replace bin=8 if age>=50 & age<55
	replace bin=9 if age>=55 & age<60
	replace bin=10 if age>=60 & age<65
	// The top bin previously started at 70, leaving ages 65-69 with no bin
	// at all; it now starts where bin 10 ends so the bins are contiguous.
	replace bin=11 if age>=65 & age<.
	
**-----------------------------------------------------------------------------**
**-----------------------------------------------------------------------------**

*********************
*** LOG INCOME
*********************
	
	//adult children
	// Only the unbinned around-40 measure is shifted by +1 before taking logs;
	// the other three are logged as they are.
	gen log_child_around40 = ln(son_totfaminc_age40+1)
	gen log_child_around40_bin = ln(son_totfaminc_age40_bin)
	gen log_child = ln(son_totfaminc)
	gen log_child_bin = ln(son_totfaminc_bin)

	label var log_child_around40 "Log adult child income, around 40, not binned"
	label var log_child_around40_bin "Log adult child income, around 40, binned"
	label var log_child "Log adult child income, in this year"
	label var log_child_bin "Log adult child income, in this year, binned"

	//fathers
		* Actual: log of (mean total family income + 1) for each averaging window
		foreach x in 1 5 10 {
			gen log_father_actual_`x'yrs = ln(mean_totfaminc_`x'+1)
			label var log_father_actual_`x'yrs "Father actual income, `x' yrs around 40"
		}

		//Log actual income around 1970 (no +1 shift)
		gen log_father_around1970_3yrs = ln(mean_HHinc_around1970_3yr)
		gen log_father_around1970_1yr = ln(mean_HHinc_around1970_1yr)

		label var log_father_around1970_3yrs "Log father income score (1970) using 3 years around 1970"
		label var log_father_around1970_1yr "Log father income score (1970) using 1 year around 1970"

		* Predicted: logged income scores for each score year (1940, 1970, 1980)
		foreach yy in 40 70 80 {

			//income scores using different number of years
			foreach x in 1 5 10 {
				gen log_father_`x'yrs_`yy' = ln(mean_HHincome_19`yy'_`x')
				label var log_father_`x'yrs_`yy' "Father income score (19`yy') using `x' years"
			}

			//retrospective answers from adult children
			gen log_father_retro_`yy' = ln(HHincome_`yy'_retro)
			gen log_father_retro_mode_`yy' = ln(HHincome_`yy'_mode_retro)
			gen log_father_retro_mode_30to50_`yy' = ln(HHincome_`yy'_mode_retro_30to50)

			label var log_father_retro_`yy' "Father income score (19`yy') using retrospective answer in this year"
			label var log_father_retro_mode_`yy' "Father income score (19`yy') using modal retrospective answer"
			label var log_father_retro_mode_30to50_`yy' "Father income score (19`yy') using modal retrospective answer between age 30-50"
		}

*******************************************		
/* Winsorized version of father's actual 
   income---cut off 2.5 percent on top 
   and bottom */
*******************************************		
	* Winsorize on the rows flagged firstobs_son==1 only, then attach the
	* result to every row of the same (son_id, father_id) pair.
	preserve
		keep if firstobs_son==1

		* p(0.025): winsorize 2.5 percent in each tail, for each averaging window.
		foreach x in 1 5 10 {
			winsor mean_totfaminc_`x', p(0.025) gen(mean_father_totfaminc_`x'yrs_win)
		}

		keep son_id father_id *_win
		tempfile tempie
		save `tempie'
	restore

	merge m:1 son_id father_id using `tempie', nogenerate

	* Logs of the winsorized means (no +1 shift here).
	foreach x in 1 5 10 {
		gen log_father_actualwin_`x'yrs = ln(mean_father_totfaminc_`x'yrs_win)
	}
	foreach x in 1 5 10 {
		label var log_father_actualwin_`x'yrs "Father income using `x' years, winsorized"
	}


* Save tempfile: checkpoint of the full data, reloaded once the ranks are built
	tempfile fulldata
	save `fulldata'

	* Grouping variable passed to by() in every rank computed below.
	local bin "bin"

**-----------------------------------------------------------------------------**
**-----------------------------------------------------------------------------**

*********************
*** RANK INCOME
*********************
	
	* Weight used for every within-year rank.
	local weight "xsection_weight"

	* For each survey year 1997, 1999, ..., 2015: keep that year's rows,
	* compute percentile ranks (nq(100)) within the groups of `bin', and store
	* identifiers plus rank_* variables in the tempfile ranks_<year>.
	forvalues yr = 1997(2)2015 {

		preserve
		keep if year==`yr'

		* Adult children, total family income in each year
		egen rank_child_`yr' = xtile(son_totfaminc), by(`bin') nq(100) weight(`weight')
		label var rank_child_`yr' "Ranked child income, year `yr'"

		egen rank_child_bin_`yr' = xtile(son_totfaminc_bin), by(`bin') nq(100) weight(`weight')
		label var rank_child_bin_`yr' "Ranked child income, year `yr', binned"

		* Fathers, annual retrospective answer
		foreach yy in 40 70 80 {
			egen rank_father_retro_`yy'_`yr' = xtile(HHincome_`yy'_retro), by(`bin') nq(100) weight(`weight')
			label var rank_father_retro_`yy'_`yr' "Ranked father income score (19`yy'), `yr' year"
		}

		* Fathers, 1970 actual income
		* NOTE(review): unlike the other ranks in this loop, these names carry
		* no year suffix, so every ranks_<year> file holds variables with the
		* same names -- confirm the later merge of these files handles that.
		foreach w in 1 3 {
			egen rank_father_around1970_`w'yr = xtile(mean_HHinc_around1970_`w'yr), by(`bin') nq(100) weight(`weight')
			label var rank_father_around1970_`w'yr "Ranked father income score (1970) around 1970, using `w' years"
		}

		* Fathers: Actual (1 and 5 years, winsorized means)
		foreach w in 1 5 {
			egen rank_father_actual_`w'yrs_`yr' = xtile(mean_father_totfaminc_`w'yrs_win) , by(`bin') nq(100) weight(`weight')
			label var rank_father_actual_`w'yrs_`yr' "Rank father income, `w' years, ranked within `yr' survey"
		}

		* Fathers: 1970 income score, 1-year measure
		egen rank_father_1yrs_`yr' = xtile(mean_HHincome_1970_1) , by(`bin') nq(100) weight(`weight')
		label var rank_father_1yrs_`yr' "Rank father income score (1970), using 1 years around 40, within `yr' survey"

		keep son_id father_id year rank_*

		tempfile ranks_`yr'
		save `ranks_`yr''

		restore

	}
		
* Other ranks

		* From here on the data in memory are reduced to the rows flagged
		* firstobs_son==1; the result is stored in the tempfile `ranks'.
		tab year_totfaminc_age40 if firstobs_son==1
		keep if firstobs_son==1

		* Weight for the ranks below; report how many rows have a zero or
		* system-missing weight.
		local weight "weight_totfaminc"
		count if `weight'==0
		count if `weight'==.

	* Rank around age 40 for child
		egen rank_child_around40 = xtile(son_totfaminc_age40) , by(`bin') nq(100) weight(`weight')
		label var rank_child_around40 "Rank child, around age 40"

		egen rank_child_around40_bin = xtile(son_totfaminc_age40_bin) , by(`bin') nq(100) weight(`weight')
		label var rank_child_around40_bin "Rank child, around age 40, binned"

	* Fathers: ranked total family income around 40 (averaged over n years)
		foreach n in 1 5 10 {
			egen rank_father_actual_`n'yrs = xtile(mean_totfaminc_`n') , by(`bin') nq(100) weight(`weight')
			label var rank_father_actual_`n'yrs "Rank father income, `n' years"
		}

	* Fathers: income score (1940/1970/1980) using n years around 40
		foreach yy in 40 70 80 {
			foreach n in 1 5 10 {
				egen rank_father_`n'yrs_`yy' = xtile(mean_HHincome_19`yy'_`n') , by(`bin') nq(100) weight(`weight')
				label var rank_father_`n'yrs_`yy' "Rank father income score (19`yy'), using `n' years around 40"
			}
		}

	* Fathers: modal retrospective income score, overall and for ages 30-50
		foreach yy in 40 70 80 {
			egen rank_father_retro_mode_`yy' = xtile(HHincome_`yy'_mode_retro), by(`bin') nq(100) weight(`weight')
			label var rank_father_retro_mode_`yy' "Rank father income score (19`yy'), modal retrospective answer"

			egen rank_father_retro_mode30to50_`yy' = xtile(HHincome_`yy'_mode_retro_30to50), by(`bin') nq(100) weight(`weight')
			label var rank_father_retro_mode30to50_`yy' "Rank father income score (19`yy'), modal retrospective answer b'w 30-50"
		}

		* Identifiers plus every rank_* variable go into the tempfile.
		keep son_id father_id rank_*
		tempfile ranks
		save `ranks'

**-----------------------------------------------------------------------------**
**-----------------------------------------------------------------------------**

**************************		
*** Merge all ranks
**************************		

		* Reload the full panel and attach the ranks stored in `ranks'
		* (one row per son_id/father_id pair).
		use `fulldata', clear
		merge m:1 son_id father_id using `ranks', nogenerate

		* Attach each survey year's ranks. Most yearly variables carry a year
		* suffix, but rank_father_around1970_1yr and rank_father_around1970_3yr
		* have the same name in every yearly file. A plain merge never
		* overwrites a variable that already exists in the master, so only the
		* first year merged (1997) would be filled and all later years would
		* stay missing. The update option fills master values that are missing
		* from the using file, so each year's rows receive their own ranks.
		forval i=1997(2)2015 {
			merge 1:1 son_id father_id year using `ranks_`i'', update nogenerate
		}

**-----------------------------------------------------------------------------**
**-----------------------------------------------------------------------------**

*********************************
* CREATE FINAL RANKED MEASURES
*********************************

	* Collapse the year-specific rank variables into one variable per measure:
	* each row takes the value computed for its own survey year.
	foreach v in rank_child rank_child_bin rank_father_retro_40 rank_father_retro_70 rank_father_retro_80 {
		gen `v' = .
	}

	forvalues yr = 1997(2)2015 {
		replace rank_child = rank_child_`yr' if year==`yr'
		replace rank_child_bin = rank_child_bin_`yr' if year==`yr'

		foreach yy in 40 70 80 {
			replace rank_father_retro_`yy' = rank_father_retro_`yy'_`yr' if year==`yr'
		}
	}

* Merge in tempfile of mistakes in recall (adds occ_match)
	* NOTE(review): _merge from this merge is not dropped, so it is stored in
	* the saved file -- confirm whether later scripts rely on it.
	merge m:1 son_id father_id using `mistakes'

* Save
	compress
	save ./output/PSID_retrospective_analysis.dta, replace
